library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.1
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the pitching stats from stats.csv.
pitchers <- read_csv("stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
## .default = col_double(),
## last_name = col_character(),
## first_name = col_character(),
## X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Re-wrap as a tibble and add a "First Last" display name.
pitchers <- tibble(pitchers) %>%
  mutate(name = paste(first_name, last_name))
# Order rows by name, with each name's rows in ascending age.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]
# Mean on-base percentage (OBP) for every player age.
pitchers_aggregated_by_age <- pitchers %>%
  group_by(player_age) %>%
  summarise(OBP = mean(on_base_percent))
# The original idea was to chart pitcher OBP by age, but that ran into a
# problem: pitchers whose OBP jumps too much presumably get dropped, so the
# average stays fairly stable across ages except at the old end.
# The bar colour is a constant, so it is set on the layer. Inside aes() the
# string "green" is treated as a data value and drawn in the default palette
# colour instead of green.
ggplot(pitchers_aggregated_by_age, aes(player_age, OBP)) +
  geom_col(fill = "green") +
  ggtitle("OBP by age") +
  theme(plot.title = element_text(hjust = 0.5))

# The remedy is to find the average OBP by "year in the league".
# Years_In_Pitching numbers each pitcher's rows 1, 2, 3, ... in their current
# order, so it relies on `pitchers` already being sorted by name and then age.
# Counting within each name group fills every row. The index loop this
# replaces stopped one row short and left the final row NA, which is where the
# "Removed 1 rows" warning and "1 observation deleted" note in the recorded
# output come from.
# as.numeric() keeps the column a double, as it was when filled by the loop.
pitchers <- pitchers %>%
  group_by(name) %>%
  mutate(Years_In_Pitching = as.numeric(row_number())) %>%
  ungroup()
# Mean OBP for each value of Years_In_Pitching.
pitchers_by_years_pitched <- pitchers %>%
  group_by(Years_In_Pitching) %>%
  summarise(OBP = mean(on_base_percent))
# Bar chart of mean OBP against years in pitching.
# The fill is a fixed colour, so it belongs on the layer; inside aes() the
# string "green" is mapped as data and the bars are not drawn green.
ggplot(pitchers_by_years_pitched, aes(Years_In_Pitching, OBP)) +
  geom_col(fill = "green") +
  ggtitle("OBP by Years in Pitching") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing missing values (position_stack).

# Look at the trajectories of several careers.

# Bar chart of one pitcher's OBP against years in pitching.
# `career` is a data frame holding a single player's rows; it must contain the
# Years_In_Pitching and on_base_percent columns. Returns the ggplot object,
# which prints when the function is called at top level.
plot_career_obp <- function(career) {
  # The fill is a fixed colour, so it is set on the layer rather than mapped
  # inside aes(), where "green" would be treated as a data value.
  ggplot(career, aes(Years_In_Pitching, on_base_percent)) +
    geom_col(fill = "green") +
    ggtitle("OBP by Years in Pitching") +
    theme(plot.title = element_text(hjust = 0.5))
}

AndyBenes <- pitchers %>% filter(name == "Andy Benes")
plot_career_obp(AndyBenes)

AJBurnett <- pitchers %>% filter(name == "A.J. Burnett")
plot_career_obp(AJBurnett)

AdamWainwright <- pitchers %>% filter(name == "Adam Wainwright")
plot_career_obp(AdamWainwright)

AndyPettitte <- pitchers %>% filter(name == "Andy Pettitte")
plot_career_obp(AndyPettitte)

#Curious about the effect of age AND years in pitching on OBP
# Linear model of OBP on player age, years in pitching and their interaction
# (`a * b` in the formula expands to a + b + a:b).
mylm <- lm(on_base_percent ~ player_age * Years_In_Pitching, data = pitchers)
#Analyze Findings
# Print the residual summary, coefficient table and R-squared for the fit.
summary(mylm)
##
## Call:
## lm(formula = on_base_percent ~ player_age * Years_In_Pitching,
## data = pitchers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.096158 -0.016611 0.000262 0.017500 0.079295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.191e-01 4.786e-03 66.681 < 2e-16 ***
## player_age -4.773e-05 1.702e-04 -0.280 0.779197
## Years_In_Pitching -3.686e-03 9.632e-04 -3.827 0.000132 ***
## player_age:Years_In_Pitching 8.487e-05 2.828e-05 3.001 0.002710 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02552 on 3320 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.009134, Adjusted R-squared: 0.008238
## F-statistic: 10.2 on 3 and 3320 DF, p-value: 1.1e-06
# Extract only the multiple R-squared from the model summary.
summary(mylm)$r.squared
## [1] 0.009133655
# Reload the pitching stats from stats.csv.
pitchers <- read_csv("stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
## .default = col_double(),
## last_name = col_character(),
## first_name = col_character(),
## X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Add the "First Last" display name and the decade each season falls in
# (e.g. 2005 -> 2000).
pitchers <- tibble(pitchers) %>%
  mutate(
    name = paste(first_name, last_name),
    decade = (year %/% 10) * 10
  )
# Order rows by name, with each name's rows in ascending age.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]
pitchers
## # A tibble: 3,325 x 28
## last_name first_name year player_age p_game p_formatted_ip p_total_pa p_ab
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Burnett A.J. 2001 24 27 173. 733 629
## 2 Burnett A.J. 2002 25 31 204. 844 732
## 3 Burnett A.J. 2005 28 32 209 873 775
## 4 Burnett A.J. 2007 30 25 165. 691 611
## 5 Burnett A.J. 2008 31 35 221. 957 849
## 6 Burnett A.J. 2009 32 33 207 896 781
## 7 Burnett A.J. 2010 33 33 186. 829 715
## 8 Burnett A.J. 2011 34 33 190. 837 731
## 9 Burnett A.J. 2012 35 31 202. 851 767
## 10 Burnett A.J. 2013 36 30 191 801 714
## # ... with 3,315 more rows, and 20 more variables: p_total_hits <dbl>,
## # p_single <dbl>, p_double <dbl>, p_triple <dbl>, p_home_run <dbl>,
## # p_strikeout <dbl>, p_walk <dbl>, p_k_percent <dbl>, p_bb_percent <dbl>,
## # batting_avg <dbl>, slg_percent <dbl>, on_base_percent <dbl>,
## # on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## # p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, decade <dbl>
#devtools::install_github("bokeh/rbokeh@v0.6.3")
library(rbokeh)
##
## Attaching package: 'rbokeh'
## The following object is masked from 'package:readr':
##
## spec
## The following object is masked from 'package:ggplot2':
##
## arrow
library(quanteda)
library(tidyverse)
# Read in data, cast as tibble
pitchers <- read_csv("stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
## .default = col_double(),
## last_name = col_character(),
## first_name = col_character(),
## X26 = col_logical()
## )
## See spec(...) for full column specifications.
pitchers <- tibble(pitchers)
# Build the "First Last" display name once; this statement used to appear
# twice in a row, recomputing the same column.
pitchers <- pitchers %>% mutate(name = paste(first_name, last_name))
# Order rows by name, with each name's rows in ascending age. One two-key
# order() gives the same result as sorting by age and then by name.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]
# Years_In_Pitching numbers each pitcher's rows 1, 2, 3, ... in their current
# order, so it relies on `pitchers` already being sorted by name and then age.
# Counting within each name group fills every row; the index loop this
# replaces used `i < length(...)`, which stopped one row short and left the
# final row NA.
# as.numeric() keeps the column a double, as it was when filled by the loop.
# `decade` is the decade each season falls in (e.g. 2005 -> 2000).
pitchers <- pitchers %>%
  group_by(name) %>%
  mutate(Years_In_Pitching = as.numeric(row_number())) %>%
  ungroup() %>%
  mutate(decade = (year %/% 10) * 10)
pitchers
## # A tibble: 3,325 x 29
## last_name first_name year player_age p_game p_formatted_ip p_total_pa p_ab
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Burnett A.J. 2001 24 27 173. 733 629
## 2 Burnett A.J. 2002 25 31 204. 844 732
## 3 Burnett A.J. 2005 28 32 209 873 775
## 4 Burnett A.J. 2007 30 25 165. 691 611
## 5 Burnett A.J. 2008 31 35 221. 957 849
## 6 Burnett A.J. 2009 32 33 207 896 781
## 7 Burnett A.J. 2010 33 33 186. 829 715
## 8 Burnett A.J. 2011 34 33 190. 837 731
## 9 Burnett A.J. 2012 35 31 202. 851 767
## 10 Burnett A.J. 2013 36 30 191 801 714
## # ... with 3,315 more rows, and 21 more variables: p_total_hits <dbl>,
## # p_single <dbl>, p_double <dbl>, p_triple <dbl>, p_home_run <dbl>,
## # p_strikeout <dbl>, p_walk <dbl>, p_k_percent <dbl>, p_bb_percent <dbl>,
## # batting_avg <dbl>, slg_percent <dbl>, on_base_percent <dbl>,
## # on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## # p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, Years_In_Pitching <dbl>,
## # decade <dbl>
# rbokeh point plot of strikeouts (y) against OBP (x), with point colour taken
# from Years_In_Pitching. The hover list names the columns to show for a
# point: name, year, OBP and strikeouts.
figure() %>%
ly_points(x = on_base_percent, y = p_strikeout, color = Years_In_Pitching,
data = pitchers, hover = list(name, year, on_base_percent, p_strikeout))
## Warning: `lang_args()` is deprecated as of rlang 0.2.0.
## Please use `call_args()` instead.
## This warning is displayed once per session.
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
#install.packages("ggpointdensity")
library(ggpointdensity)
## Warning: package 'ggpointdensity' was built under R version 4.0.3
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter of strikeouts against OBP, coloured by decade, handed to plotly.
# A geom layer is required: ggplot() with only a data set and a mapping draws
# empty axes, so there was nothing for ggplotly() to show.
# NOTE(review): ggpointdensity is loaded above but unused; swap geom_point()
# for geom_pointdensity() if a density colouring was intended -- confirm.
p <- ggplot(pitchers, aes(on_base_percent, p_strikeout, colour = decade)) +
  geom_point()
ggplotly(p)
#install.packages("WVPlots")
#install.packages("GGally")
library(WVPlots)
## Warning: package 'WVPlots' was built under R version 4.0.3
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.3
##
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
##
## coalesce
## The following object is masked from 'package:tidyr':
##
## unpack
## The following object is masked from 'package:tibble':
##
## view
library(quanteda)
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
# Reload the pitching stats from stats.csv.
pitchers <- read_csv("stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
## .default = col_double(),
## last_name = col_character(),
## first_name = col_character(),
## X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Add the display name, the numeric decade of each season, and a text copy of
# the decade (sdecade) for use as a discrete colour key.
pitchers <- tibble(pitchers) %>%
  mutate(
    name = paste(first_name, last_name),
    decade = (year %/% 10) * 10,
    sdecade = format(decade, digits = 4)
  )
# Order rows by name, with each name's rows in ascending age.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]
# List the column names before renaming.
colnames(pitchers)
## [1] "last_name" "first_name" "year" "player_age"
## [5] "p_game" "p_formatted_ip" "p_total_pa" "p_ab"
## [9] "p_total_hits" "p_single" "p_double" "p_triple"
## [13] "p_home_run" "p_strikeout" "p_walk" "p_k_percent"
## [17] "p_bb_percent" "batting_avg" "slg_percent" "on_base_percent"
## [21] "on_base_plus_slg" "isolated_power" "p_earned_run" "p_run"
## [25] "p_balk" "X26" "name" "decade"
## [29] "sdecade"
# Give four columns reader-friendly names for use as plot labels.
pitchers <- pitchers %>%
  rename(
    `Player Age` = player_age,
    Strikeouts = p_strikeout,
    Walks = p_walk,
    OBP = on_base_percent
  )
pitchers
## # A tibble: 3,325 x 29
## last_name first_name year `Player Age` p_game p_formatted_ip p_total_pa
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Burnett A.J. 2001 24 27 173. 733
## 2 Burnett A.J. 2002 25 31 204. 844
## 3 Burnett A.J. 2005 28 32 209 873
## 4 Burnett A.J. 2007 30 25 165. 691
## 5 Burnett A.J. 2008 31 35 221. 957
## 6 Burnett A.J. 2009 32 33 207 896
## 7 Burnett A.J. 2010 33 33 186. 829
## 8 Burnett A.J. 2011 34 33 190. 837
## 9 Burnett A.J. 2012 35 31 202. 851
## 10 Burnett A.J. 2013 36 30 191 801
## # ... with 3,315 more rows, and 22 more variables: p_ab <dbl>,
## # p_total_hits <dbl>, p_single <dbl>, p_double <dbl>, p_triple <dbl>,
## # p_home_run <dbl>, Strikeouts <dbl>, Walks <dbl>, p_k_percent <dbl>,
## # p_bb_percent <dbl>, batting_avg <dbl>, slg_percent <dbl>, OBP <dbl>,
## # on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## # p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, decade <dbl>,
## # sdecade <chr>
# Pairs plot of player age, strikeouts, walks and OBP, coloured by decade.
# - Columns are picked by name instead of position, so the plot does not
#   silently change if the CSV gains, loses or reorders a column.
# - legend.position is a theme setting, not an aesthetic, and had no effect
#   inside aes(), so it is dropped from the mapping.
#   NOTE(review): if a legend on the left was intended, pass `legend = 1` to
#   ggpairs() and add `+ theme(legend.position = "left")` -- confirm.
# - The plot is stored and handed to ggsave() explicitly, so the saved image
#   does not depend on which plot happened to be printed last.
crossplot_matrix <- ggpairs(
  pitchers,
  mapping = aes(color = sdecade, alpha = 0.6),
  columns = c("Player Age", "Strikeouts", "Walks", "OBP")
)
crossplot_matrix

# `filename` is spelled out in full rather than relying on partial matching
# of `file`.
ggsave(
  filename = "Crossplot Matrix.png",
  plot = crossplot_matrix,
  width = 8,
  height = 5,
  dpi = 500
)